PBTA Unsupervised Clustering Analysis & ML Implementation¶

Author: Shehbeel Arif¶

Preclinical Laboratory Research Unit¶

The Center for Data Driven Discovery in Biomedicine (D3b)¶

Children's Hospital of Philadelphia¶

In [1]:
# Packages used for Data pre-processing, unsupervised classifications, and plotting

# Basic libraries for data pre-processing/handling
import numpy as np
import pandas as pd

# Libraries for plotting
import plotly.express as px
import matplotlib.pyplot as plt

# Library used to standardize data
from sklearn.preprocessing import MinMaxScaler

# Libraries used for unsupervised classification and clustering
# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
# K-means classification
from sklearn.cluster import KMeans
# PCA clustering
from sklearn.decomposition import PCA
In [2]:
# Load in the PBTA dataset
# (rows = tumor samples; columns = a "class" label plus 2083 miRNA read counts;
#  assumes pbta_mirna_tdata.csv sits next to this notebook -- TODO confirm path)
df = pd.read_csv("pbta_mirna_tdata.csv")
df.head()
Out[2]:
class let-7a-2-3p let-7a-3p let-7a-5p let-7b-5p let-7c-3p let-7c-5p let-7d-3p let-7d-5p let-7e-3p ... miR-944 miR-95-3p miR-95-5p miR-9-5p miR-96-3p miR-96-5p miR-98-3p miR-99a-5p miR-99b-3p miR-99b-5p
0 ATRT 54 142 52379 21782 216 35044 1729 19661 296 ... 2 165 7 25001 14 1853 10 35498 422 10995
1 ATRT 136 109 39346 10494 38 16308 2132 20500 245 ... 34 51 35 1840 46 144 20 9680 443 7100
2 ATRT 13 74 39079 16633 159 30219 1614 13793 221 ... 3 27 6 5624 1 54 3 36509 312 7834
3 ATRT 28 64 19514 5273 38 9328 1003 10016 154 ... 2 307 3 4800 6 3818 3 9431 431 9542
4 ATRT 144 92 69964 20807 245 43025 1101 16204 291 ... 1 1907 0 94839 0 293 8 69583 375 11645

5 rows × 2084 columns

In [3]:
# Removing class labels to standardize data and perform unsupervised classifications
cancer_class = df["class"]            # keep the labels for later supervised use
class_labels = cancer_class.to_list()

# Encode each label as an integer by first-appearance order.
# dict.fromkeys() keeps one copy of every label in first-seen order, and
# list.index() maps each sample to that position -- equivalent to the
# original np.where one-liner, but readable.
label_order = list(dict.fromkeys(class_labels))
class_labels_num = [label_order.index(e) for e in class_labels]

df = df.drop(["class"], axis=1)       # features only from here on
df.head()
Out[3]:
let-7a-2-3p let-7a-3p let-7a-5p let-7b-5p let-7c-3p let-7c-5p let-7d-3p let-7d-5p let-7e-3p let-7e-5p ... miR-944 miR-95-3p miR-95-5p miR-9-5p miR-96-3p miR-96-5p miR-98-3p miR-99a-5p miR-99b-3p miR-99b-5p
0 54 142 52379 21782 216 35044 1729 19661 296 11693 ... 2 165 7 25001 14 1853 10 35498 422 10995
1 136 109 39346 10494 38 16308 2132 20500 245 9096 ... 34 51 35 1840 46 144 20 9680 443 7100
2 13 74 39079 16633 159 30219 1614 13793 221 8330 ... 3 27 6 5624 1 54 3 36509 312 7834
3 28 64 19514 5273 38 9328 1003 10016 154 4879 ... 2 307 3 4800 6 3818 3 9431 431 9542
4 144 92 69964 20807 245 43025 1101 16204 291 14064 ... 1 1907 0 94839 0 293 8 69583 375 11645

5 rows × 2083 columns

In [4]:
# Min-max scale every miRNA column into [0, 1] so that highly expressed
# genes do not dominate the distance computations used by the clusterers.
df[df.columns] = MinMaxScaler().fit_transform(df)
df.head()
Out[4]:
let-7a-2-3p let-7a-3p let-7a-5p let-7b-5p let-7c-3p let-7c-5p let-7d-3p let-7d-5p let-7e-3p let-7e-5p ... miR-944 miR-95-3p miR-95-5p miR-9-5p miR-96-3p miR-96-5p miR-98-3p miR-99a-5p miR-99b-3p miR-99b-5p
0 0.154286 0.338902 0.143289 0.063826 0.302521 0.139854 0.297516 0.225642 0.139480 0.281512 ... 0.019417 0.036400 0.118644 0.038400 0.026718 0.005520 0.222222 0.131925 0.053069 0.336820
1 0.388571 0.260143 0.107580 0.030637 0.053221 0.064993 0.367023 0.235279 0.115366 0.218898 ... 0.330097 0.011251 0.593220 0.002809 0.087786 0.000429 0.444444 0.035788 0.055717 0.217349
2 0.037143 0.176611 0.106849 0.048687 0.222689 0.120575 0.277682 0.158235 0.104019 0.200429 ... 0.029126 0.005956 0.101695 0.008624 0.001908 0.000161 0.066667 0.135689 0.039203 0.239863
3 0.080000 0.152745 0.053244 0.015286 0.053221 0.037103 0.172301 0.114848 0.072340 0.117224 ... 0.019417 0.067726 0.050847 0.007358 0.011450 0.011373 0.066667 0.034861 0.054204 0.292252
4 0.411429 0.219570 0.191469 0.060959 0.343137 0.171743 0.189203 0.185931 0.137116 0.338678 ... 0.009709 0.420693 0.000000 0.145717 0.000000 0.000873 0.177778 0.258845 0.047145 0.356757

5 rows × 2083 columns

In [5]:
# Hierarchical (agglomerative) clustering with sklearn defaults,
# fit on the scaled expression matrix; show the cluster assignments.
hclust = AgglomerativeClustering()
hclust.fit(df)
hclust.labels_
Out[5]:
array([1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

Notes: Hierarchical clustering (with the default of two clusters) only split the tumor samples into two groups, so it cannot distinguish the individual subtypes

In [6]:
# K-means with 10 clusters (one per expected tumor type); the fixed
# random_state makes the cluster assignments reproducible across runs.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(df)
kmeans.labels_
Out[6]:
array([5, 3, 2, 0, 1, 2, 0, 1, 7, 0, 0, 1, 5, 1, 1, 2, 5, 5, 2, 1, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 5, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1,
       1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 5, 1,
       2, 1, 1, 4, 2, 1, 1, 1, 1, 1, 6, 1, 5, 1, 1, 1, 9, 9, 9, 9, 9, 9,
       9, 9, 8, 1, 1, 2, 9, 5, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 9, 9, 5, 9, 9, 5, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 3, 1, 5, 1, 1, 1, 1, 1, 2, 1,
       1, 2, 2, 1, 0, 1, 9, 9, 9, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 1, 2],
      dtype=int32)

Notes: Unsupervised classification of brain tumor samples

In [7]:
# Initialize PCA object
pca = PCA(n_components=4) # Looking at only the first four principal components

# Perform PCA on PBTA dataset
# (returns an (n_samples, 4) array; column 0 = PC1, column 1 = PC2, ...)
df_comps = pca.fit_transform(df)
In [8]:
# NOTE: the original cell re-ran pca.fit_transform(df) here, duplicating the
# work already done (and stored in df_comps) in the previous cell; the
# redundant re-fit has been removed.

# Display PCA plot based on K-means Classification: points are coloured by
# the cluster id K-means assigned to each sample.
fig = px.scatter(df_comps, x=0, y=1, color=kmeans.labels_)
fig.update_layout(title="PCA Plot based on K-means Predicted Tumor Labels", xaxis_title="PC1", yaxis_title="PC2")
fig.show()
In [9]:
# Same PCA projection as above, but coloured by the known tumor class so
# the K-means partition can be compared against the ground truth.
fig = px.scatter(df_comps, x=0, y=1, color=class_labels)
fig.update_layout(
    title="PCA Plot based on True Tumor Labels",
    xaxis_title="PC1",
    yaxis_title="PC2",
)
fig.show()
In [10]:
# 3-D view of the first three principal components, coloured by the
# K-means cluster assignments. update_layout returns the figure, so the
# call can be chained straight into show().
fig = px.scatter_3d(df_comps, x=0, y=1, z=2, color=kmeans.labels_)
fig.update_layout(title="3D PCA Plot based on K-means Predicted Tumor Labels").show()
In [11]:
# 3-D view of the first three principal components, coloured by the
# true tumor class for side-by-side comparison with the K-means view.
fig = px.scatter_3d(df_comps, x=0, y=1, z=2, color=class_labels)
fig.update_layout(title="3D PCA Plot based on True Tumor Labels").show()

Notes:

  • Too much variation within samples
  • PCA is possibly not a good method for distinguishing tumor-type clusters

Lets try reducing the number of miRNA genes used for clustering by looking at only the genes that account for the greatest variance between the samples¶

In [12]:
# Feature Selection using Variance Threshold library
from sklearn.feature_selection import VarianceThreshold

def variance_threshold_selector(data, threshold=0.5):
    """Return `data` restricted to the columns whose variance exceeds `threshold`.

    Parameters
    ----------
    data : pd.DataFrame
        Feature matrix (columns = miRNA genes).
    threshold : float, default 0.5
        Minimum variance a column must have to be kept.
    """
    selector = VarianceThreshold(threshold).fit(data)
    # get_support() gives a boolean mask over columns; .loc keeps exactly
    # the supported ones (same frame as the original index-based lookup).
    return data.loc[:, selector.get_support()]
In [13]:
# Keep only the miRNAs whose variance exceeds 0.02 on the min-max-scaled
# data (threshold chosen empirically; retains 194 of the 2083 genes).
df_filtered = variance_threshold_selector(df, 0.02)
df_filtered
Out[13]:
let-7a-5p let-7c-3p let-7c-5p let-7d-3p let-7i-5p miR-100-5p miR-101-5p miR-106a-5p miR-1227-3p miR-1244 ... miR-887-5p miR-889-5p miR-92a-1-5p miR-92a-3p miR-93-5p miR-9-3p miR-944 miR-9-5p miR-99a-5p miR-99b-5p
0 0.143289 0.302521 0.139854 0.297516 0.169913 0.162352 0.028509 0.088332 0.432432 0.285301 ... 0.165049 0.333333 0.121053 0.096909 0.096866 0.009828 0.019417 0.038400 0.131925 0.336820
1 0.107580 0.053221 0.064993 0.367023 0.277541 0.085068 0.242873 0.076490 0.554054 0.079411 ... 0.330097 0.714286 0.152632 0.111613 0.085791 0.001724 0.330097 0.002809 0.035788 0.217349
2 0.106849 0.222689 0.120575 0.277682 0.128592 0.164702 0.168860 0.729255 0.209459 0.185246 ... 0.155340 0.000000 0.100000 0.334132 0.119272 0.001652 0.029126 0.008624 0.135689 0.239863
3 0.053244 0.053221 0.037103 0.172301 0.089621 0.056314 0.021930 0.423439 0.351351 0.242224 ... 0.000000 0.000000 0.631579 0.283130 0.287190 0.002740 0.019417 0.007358 0.034861 0.292252
4 0.191469 0.343137 0.171743 0.189203 0.106273 0.395581 0.071820 0.121239 0.371622 0.233141 ... 0.038835 0.023810 0.152632 0.172057 0.140428 0.056377 0.009709 0.145717 0.258845 0.356757
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
259 0.573254 0.147059 0.336218 0.123491 0.313269 0.194475 0.625548 0.246264 0.324324 0.778833 ... 0.067961 0.071429 0.247368 0.259524 0.587547 0.192195 0.038835 0.337382 0.223593 0.278480
260 0.572942 0.247899 0.512316 0.083994 0.195404 0.456029 0.690241 0.168095 0.189189 0.354803 ... 0.048544 0.047619 0.126316 0.132789 0.561155 0.076084 0.000000 0.226200 0.569656 0.229188
261 0.125296 0.047619 0.103858 0.131425 0.221471 0.088010 0.071272 0.070227 0.054054 0.030829 ... 0.000000 0.000000 0.042105 0.044081 0.024489 0.002072 0.000000 0.008190 0.066024 0.075977
262 0.210804 0.035014 0.202541 0.554502 0.293949 0.402309 0.218202 0.430442 0.067568 0.075420 ... 0.300971 0.142857 0.145614 0.259640 0.083926 0.002292 0.009709 0.001466 0.507576 0.370744
263 0.065880 0.026611 0.054980 0.089514 0.094537 0.057825 0.050439 0.033847 0.033784 0.050096 ... 0.009709 0.000000 0.012281 0.051658 0.031531 0.012591 0.000000 0.018062 0.044981 0.047880

264 rows × 194 columns

In [14]:
# Perform PCA on filtered dataset
# (re-fits the same 4-component PCA object from above on the reduced matrix)
df_filtered_comps = pca.fit_transform(df_filtered)

# Display a 3D PCA plot of the PBTA dataset using only 194 miRNA genes that account for the greatest variance
fig = px.scatter_3d(df_filtered_comps, x=0, y=1, z=2, color=class_labels) # Using three principal components
fig.show()

Using ML for miRNA gene Feature Selection¶

In [15]:
# Libraries used for Machine Learning Classification

# Library used to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Library for measuring accuracy of ML models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Machine Learning model used
from sklearn.ensemble import RandomForestClassifier
In [16]:
# Rebuild a labelled frame for supervised learning: copy the scaled
# feature matrix and re-insert the "class" column at position 0.
ml_df = df.copy()
ml_df.insert(loc=0, column='class', value=cancer_class)
ml_df
Out[16]:
class let-7a-2-3p let-7a-3p let-7a-5p let-7b-5p let-7c-3p let-7c-5p let-7d-3p let-7d-5p let-7e-3p ... miR-944 miR-95-3p miR-95-5p miR-9-5p miR-96-3p miR-96-5p miR-98-3p miR-99a-5p miR-99b-3p miR-99b-5p
0 ATRT 0.154286 0.338902 0.143289 0.063826 0.302521 0.139854 0.297516 0.225642 0.139480 ... 0.019417 0.036400 0.118644 0.038400 0.026718 0.005520 0.222222 0.131925 0.053069 0.336820
1 ATRT 0.388571 0.260143 0.107580 0.030637 0.053221 0.064993 0.367023 0.235279 0.115366 ... 0.330097 0.011251 0.593220 0.002809 0.087786 0.000429 0.444444 0.035788 0.055717 0.217349
2 ATRT 0.037143 0.176611 0.106849 0.048687 0.222689 0.120575 0.277682 0.158235 0.104019 ... 0.029126 0.005956 0.101695 0.008624 0.001908 0.000161 0.066667 0.135689 0.039203 0.239863
3 ATRT 0.080000 0.152745 0.053244 0.015286 0.053221 0.037103 0.172301 0.114848 0.072340 ... 0.019417 0.067726 0.050847 0.007358 0.011450 0.011373 0.066667 0.034861 0.054204 0.292252
4 ATRT 0.411429 0.219570 0.191469 0.060959 0.343137 0.171743 0.189203 0.185931 0.137116 ... 0.009709 0.420693 0.000000 0.145717 0.000000 0.000873 0.177778 0.258845 0.047145 0.356757
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
259 Medulloblastoma 0.008571 0.121718 0.573254 0.176071 0.147059 0.336218 0.123491 0.158419 0.088416 ... 0.038835 0.019854 0.000000 0.337382 0.137405 0.177326 0.044444 0.223593 0.040590 0.278480
260 Medulloblastoma 0.008571 0.119332 0.572942 0.174848 0.247899 0.512316 0.083994 0.133262 0.071868 ... 0.000000 0.035297 0.000000 0.226200 0.015267 0.018591 0.022222 0.569656 0.026976 0.229188
261 Schwannoma 0.042857 0.054893 0.125296 0.058948 0.047619 0.103858 0.131425 0.100466 0.022222 ... 0.000000 0.018751 0.000000 0.008190 0.000000 0.000098 0.000000 0.066024 0.010967 0.075977
262 Schwannoma 0.217143 0.033413 0.210804 0.263921 0.035014 0.202541 0.554502 0.295299 0.104965 ... 0.009709 0.012795 0.016949 0.001466 0.000000 0.000018 0.066667 0.507576 0.032648 0.370744
263 Teratoma 0.008571 0.019093 0.065880 0.029196 0.026611 0.054980 0.089514 0.047384 0.013239 ... 0.000000 0.015222 0.000000 0.018062 0.000000 0.000438 0.044444 0.044981 0.005546 0.047880

264 rows × 2084 columns

In [17]:
# Split the labelled frame into a feature matrix X (all miRNA columns)
# and a 1-D label vector y (the 'class' column).
X = ml_df.drop(columns='class').values
y = ml_df['class'].values

# Sanity check
# print(X[:5])
# print(y[:5])
In [18]:
# Split PBTA data into training and testing set
# (75/25 split; random_state fixed so the split is reproducible. Note the
#  split is NOT stratified, so rare tumor types can be absent from the test
#  set -- the notes below confirm this happens.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(198, 2083) (66, 2083) (198,) (66,)
In [19]:
# Initialize random forest classifier
# (max_depth=2 is very shallow for 2083 features, which likely explains the
#  modest accuracy reported below; a deeper model is trained further down)
rfc = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
rfc.fit(X_train, y_train)

# Make predictions using random forest classifier on the held-out test set
rfc_y_pred = rfc.predict(X_test)
In [20]:
# Accuracy of model on the held-out 25% test set
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')
Accuracy: 0.5757575757575758
In [21]:
# Calculate a confusion matrix; rows/columns are ordered by rfc.classes_
cm = confusion_matrix(y_test, rfc_y_pred, labels=rfc.classes_)

# Display confusion matrix to look at how accurately the ML model was able
# to classify each tumor type.
# FIX: the axis tick labels must use the SAME ordering as the matrix
# (rfc.classes_), not cancer_class.unique() (first-appearance order),
# otherwise the rows/columns are mislabeled. The colorbar shows sample
# counts, not "Productivity".
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Tumor Type", y="True Tumor Type", color="Count"),
                x=rfc.classes_.tolist(),
                y=rfc.classes_.tolist()
                )
disp.show()

Notes:

  • No 'ATRT', 'Craniopharyngioma', 'Ganglioglioma', 'GNT', 'Schwannoma', 'Teratoma' samples were included in testing set
  • LGAT consisted of 48/66 samples in the testing set used for the confusion matrix (too much!)
  • The ML model was able to classify Medulloblastomas 8/11 (82%) accurately. Two samples were classified as ATRT, which is interesting because ATRT shares some molecular features with Medulloblastomas.
  • The ML model was able to classify Ependymoma 5/6 (83%) accurately.
  • A lot of the misclassification that exists within LGAT may be due to the fact that the clinical samples classified as LGAT consisted of HGAT, Craniopharyngioma, and GNT subtypes that were not identified?

Finding the most important miRNAs used by ML model¶

In [22]:
# What are the most important features?
# Pair each miRNA name with the forest's impurity-based importance score.
feature_list = ml_df.columns.drop('class')

imp_features = pd.Series(rfc.feature_importances_, index=feature_list)

imp_genes = imp_features.sort_values(ascending=False).to_frame().reset_index()
imp_genes.columns = ["features", "importance"]

# Keep only the features the model actually used. Filtering the numeric
# column directly is clearer and safer than the original frame-wide
# `(imp_genes == 0.0).any(axis=1)`, which also compared the string
# "features" column against 0.
imp_genes_fil = imp_genes[imp_genes["importance"] > 0]
imp_genes_fil
Out[22]:
features importance
0 miR-106b-5p 0.030923
1 miR-93-5p 0.021131
2 miR-106b-3p 0.019165
3 miR-34b-5p 0.017196
4 miR-345-5p 0.016621
... ... ...
204 miR-301a-5p 0.000539
205 miR-4999-5p 0.000534
206 miR-758-3p 0.000491
207 miR-224-3p 0.000457
208 miR-493-3p 0.000446

209 rows × 2 columns

In [23]:
# Interactive bar plot of the non-zero feature importances
# (bars are already sorted from most to least important).
bar_fig = px.bar(imp_genes_fil, x=imp_genes_fil.features, y=imp_genes_fil.importance)
bar_fig.show()

Export PBTA dataset containing only important miRNA genes¶

In [24]:
# Export of the ML-selected gene list is currently disabled; uncomment to
# write one entry per line to imp_genes.txt.
# NOTE(review): if re-enabled, prefer a `with open(...)` context manager.
# Also, imp_genes_fil.index holds positional integers after reset_index;
# the gene names live in imp_genes_fil["features"] -- confirm before use.
# imp_genes_fil_list = imp_genes_fil.index
# imp_genes_fil_list

# textfile = open("imp_genes.txt", "w")
# for element in imp_genes_fil_list:
#     textfile.write(element + "\n")
# textfile.close()

Looking at class imbalance in PBTA dataset¶

In [25]:
# Class-imbalance check: number of samples per tumor type.
fig = px.histogram(ml_df, x="class", nbins=10)
fig.show()

Next Steps:¶

  • Balance the PBTA dataset by downsampling LGAT, adding more samples to other tumor types, and/or removing GNT, Schwannoma, and Teratoma.
  • Improve accuracy of ML model
  • Look at new list of important miRNAs and their biological significance
  • Try clustering with these new miRNAs

UMAP Projections using all miRNA genes¶

In [26]:
# Library for UMAP projections
from umap.umap_ import UMAP
In [27]:
# UMAP embeddings (2-D and 3-D) of the full 2083-gene scaled matrix.
# init='random' plus a fixed random_state makes the layout reproducible.
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(df)
proj_3d = umap_3d.fit_transform(df)

# Colour points by the true tumor class
fig_2d = px.scatter(proj_2d, x=0, y=1, color=class_labels)

fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=class_labels)

fig_2d.show()
fig_3d.show()

UMAP Projections using Variance Threshold filtered miRNA genes¶

In [28]:
# Same UMAP recipe, but on the variance-threshold-filtered gene set.
# (Re-binds umap_2d/umap_3d/proj_2d/proj_3d from the previous cell.)
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(df_filtered)
proj_3d = umap_3d.fit_transform(df_filtered)

# Colour by the true tumor class
fig_2d = px.scatter(proj_2d, x=0, y=1, color=class_labels)

fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=class_labels)

fig_2d.show()
fig_3d.show()

UMAP Projections using ML model selected miRNA genes¶

In [29]:
# Make a list of genes that ML model found important
ml_fil_genes = imp_genes_fil['features'].tolist()

# Create a dataframe containing only those genes
# (DataFrame.filter keeps only the listed columns, so 'class' is dropped too)
ml_fil_df = ml_df.filter(ml_fil_genes)
In [30]:
# UMAP on only the miRNAs the random forest found important.
# (Again re-binds the umap_* / proj_* names from the earlier cells.)
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(ml_fil_df)
proj_3d = umap_3d.fit_transform(ml_fil_df)

# Colour by the true tumor class
fig_2d = px.scatter(proj_2d, x=0, y=1, color=class_labels)

fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=class_labels)

fig_2d.show()
fig_3d.show()

t-SNE Projections using all the miRNA genes¶

In [31]:
# Library for t-SNE projections
from sklearn.manifold import TSNE
In [33]:
# Initialize t-SNE object
# (init is left at its default, hence the FutureWarning below about the
#  default changing from 'random' to 'pca')
tsne_2d = TSNE(n_components=2, random_state=0, learning_rate='auto')
tsne_3d = TSNE(n_components=3, random_state=0, learning_rate='auto')

# Fit and transform the data (all 2083 genes)
tsne_proj_2d = tsne_2d.fit_transform(df)
tsne_proj_3d = tsne_3d.fit_transform(df)

tsne_fig_2d = px.scatter(tsne_proj_2d, x=0, y=1, color=class_labels)
tsne_fig_3d = px.scatter_3d(tsne_proj_3d, x=0, y=1, z=2, color=class_labels)

tsne_fig_2d.show()
tsne_fig_3d.show()
/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

t-SNE Projections using Variance Threshold filtered miRNA genes¶

In [34]:
# Fit and transform the variance-threshold-filtered data
# (re-uses the tsne_2d/tsne_3d estimators; fit_transform re-fits from scratch)
tsne_proj_2d = tsne_2d.fit_transform(df_filtered)
tsne_proj_3d = tsne_3d.fit_transform(df_filtered)

tsne_fig_2d = px.scatter(tsne_proj_2d, x=0, y=1, color=class_labels)
tsne_fig_3d = px.scatter_3d(tsne_proj_3d, x=0, y=1, z=2, color=class_labels)

tsne_fig_2d.show()
tsne_fig_3d.show()
/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

t-SNE Projections using ML model selected miRNA genes¶

In [35]:
# Fit and transform the ML-selected gene subset
# (re-uses the tsne_2d/tsne_3d estimators; fit_transform re-fits from scratch)
tsne_proj_2d = tsne_2d.fit_transform(ml_fil_df)
tsne_proj_3d = tsne_3d.fit_transform(ml_fil_df)

tsne_fig_2d = px.scatter(tsne_proj_2d, x=0, y=1, color=class_labels)
tsne_fig_3d = px.scatter_3d(tsne_proj_3d, x=0, y=1, z=2, color=class_labels)

tsne_fig_2d.show()
tsne_fig_3d.show()
/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

/Users/arifs2/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

Conclusion:¶

- Using the miRNA genes based on the variance threshold leads to improved clustering.¶

- Using the miRNA genes that the ML model found important in its decision-making process leads to better clustering. If we improve the ML model, then we could possibly further improve the clustering.¶

- This is a proof-of-concept of using an ML model backwards to perform feature selection. In this case, we identified 209 miRNA genes that the ML model finds significant.¶

- PCA and t-SNE plotting are not good for projecting PBTA data¶

- OVERALL: Gene filtering improves unsupervised clustering¶


In [36]:
# List of overlapping miRNA genes found from variance threshold method and ML algorithm
list(set(df_filtered.columns.tolist()) & set(imp_genes_fil['features'].tolist()))
Out[36]:
['miR-589-5p',
 'miR-125b-2-3p',
 'miR-29b-1-5p',
 'miR-431-3p',
 'miR-542-3p',
 'miR-19b-3p',
 'miR-100-5p',
 'miR-99a-5p',
 'miR-6726-3p',
 'miR-200a-3p',
 'miR-30e-5p',
 'miR-505-5p',
 'miR-19a-3p',
 'miR-200b-3p',
 'miR-4667-3p',
 'miR-32-3p',
 'miR-125b-5p',
 'miR-185-3p',
 'miR-1306-5p',
 'miR-944',
 'miR-216b-3p',
 'miR-20a-5p',
 'miR-217',
 'miR-128-3p',
 'miR-6769b-3p',
 'miR-346',
 'let-7c-5p',
 'let-7i-5p',
 'miR-615-3p',
 'miR-216b-5p',
 'miR-23a-3p',
 'miR-28-5p',
 'miR-130a-3p',
 'miR-106a-5p',
 'miR-151a-3p',
 'miR-101-5p',
 'miR-5001-3p',
 'miR-93-5p',
 'miR-3607-5p',
 'miR-624-5p',
 'miR-92a-3p',
 'miR-429',
 'miR-191-5p',
 'miR-212-3p',
 'miR-19b-1-5p',
 'miR-92a-1-5p',
 'miR-1307-3p',
 'miR-30c-2-3p',
 'miR-425-5p',
 'miR-3943',
 'miR-584-5p',
 'miR-185-5p',
 'let-7c-3p',
 'miR-200c-3p',
 'miR-17-5p',
 'miR-676-3p',
 'miR-27a-3p',
 'miR-23b-3p',
 'miR-30e-3p']

Improving the Model¶

In [39]:
# Reload the raw (unscaled) miRNA counts.
# NOTE(review): this re-binds `df`, which earlier cells expect to hold the
# min-max-scaled matrix -- re-running earlier cells after this one would
# silently operate on raw counts. A distinct name (e.g. df_raw) would be safer.
df = pd.read_csv("pbta_mirna_tdata.csv")

# Split the dataset into features and labels
X = df.loc[:, df.columns != 'class'].values
y = df.loc[:, df.columns == 'class'].values.ravel()

# Split PBTA data into training and testing set (same seed as before, so
# the train/test partition matches the earlier experiment)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Initialize random forest classifier
# (deeper trees than the first model: max_depth raised from 2 to 6)
rfc = RandomForestClassifier(max_depth=6, random_state=0)

# Train the random forest classifier
rfc.fit(X_train, y_train)

# Make predictions using random forest classifier
rfc_y_pred = rfc.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')
(198, 2083) (66, 2083) (198,) (66,)
Accuracy: 0.803030303030303
In [40]:
# Calculate a confusion matrix; rows/columns are ordered by rfc.classes_
cm = confusion_matrix(y_test, rfc_y_pred, labels=rfc.classes_)

# Display confusion matrix to look at how accurately the ML model was able
# to classify each tumor type.
# FIX: tick labels must follow the matrix ordering (rfc.classes_), not
# cancer_class.unique() (first-appearance order), otherwise the axes are
# mislabeled. The colorbar shows sample counts, not "Productivity".
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Tumor Type", y="True Tumor Type", color="Count"),
                x=rfc.classes_.tolist(),
                y=rfc.classes_.tolist()
                )
disp.show()
In [41]:
# What are the most important features for the retrained (deeper) model?
new_feature_list = df.columns.drop('class')

new_imp_features = pd.Series(rfc.feature_importances_, index=new_feature_list)

new_imp_genes = new_imp_features.sort_values(ascending=False).to_frame().reset_index()
new_imp_genes.columns = ["features", "importance"]

# Keep only features with non-zero importance. Filtering the numeric
# column directly avoids the original frame-wide `== 0.0` comparison,
# which also compared the string "features" column against 0.
new_imp_genes_fil = new_imp_genes[new_imp_genes["importance"] > 0]
new_imp_genes_fil
Out[41]:
features importance
0 miR-106b-5p 0.014014
1 miR-216b-5p 0.011857
2 miR-25-3p 0.010249
3 miR-34b-5p 0.008669
4 miR-4423-5p 0.007628
... ... ...
1139 miR-1913 0.000067
1140 miR-513b-5p 0.000066
1141 miR-652-5p 0.000066
1142 miR-3678-3p 0.000066
1143 miR-4790-3p 0.000026

1144 rows × 2 columns

In [42]:
# Overlap between the genes the shallow model found important and those
# the retrained deeper model found important (printed: 203 of 209 shared).
overlap = list(set(imp_genes_fil['features'].tolist()) & set(new_imp_genes_fil['features'].tolist()))
print(len(imp_genes_fil['features'].tolist()))
print(len(overlap))
209
203
In [45]:
# List of newly trained ML model genes
# NOTE(review): the [0:3] slice keeps only the TOP-3 genes, although the
# comment suggests the full list was intended -- confirm which was meant.
new_ml_fil_genes = new_imp_genes_fil['features'].tolist()[0:3]

# Create a dataframe containing only those genes
# (uses the *scaled* ml_df, not the raw `df` reloaded above)
new_ml_fil_df = ml_df.filter(new_ml_fil_genes)

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(new_ml_fil_df)
proj_3d = umap_3d.fit_transform(new_ml_fil_df)

# Colour by the true tumor class
fig_2d = px.scatter(proj_2d, x=0, y=1, color=class_labels)

fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=class_labels)

fig_2d.show()
fig_3d.show()

Training the model on UMAP data¶

In [71]:
# Split the dataset into features and labels.
# NOTE(review): proj_3d is the UMAP embedding from the previous cell, which
# was fit on ALL samples before this train/test split -- the embedding has
# already "seen" the test samples, so this accuracy is likely optimistic.
X = proj_3d
y = df.loc[:, df.columns == 'class'].values.ravel()

# Split PBTA data into training and testing set (same seed as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Initialize random forest classifier on the 3 UMAP components
rfc = RandomForestClassifier(max_depth=3, random_state=0)

# Train the random forest classifier
rfc.fit(X_train, y_train)

# Make predictions using random forest classifier
rfc_y_pred = rfc.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')
(198, 3) (66, 3) (198,) (66,)
Accuracy: 0.5606060606060606

Use Variance Threshold Genes¶

In [ ]: